import pandas as pd
import numpy as np
# Load the data
file = 'C:/Users/Lajpat Rai/Desktop/Ryseron Data Science Diploma/CIND820 Project/data.csv'
df = pd.read_csv(file)
df.head() # let' see first 5 rows
| AT | AP | AH | AFDP | GTEP | TIT | TAT | TEY | CDP | CO | NOX | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.5878 | 1018.7 | 83.675 | 3.5758 | 23.979 | 1086.2 | 549.83 | 134.67 | 11.898 | 0.32663 | 81.952 |
| 1 | 4.2932 | 1018.3 | 84.235 | 3.5709 | 23.951 | 1086.1 | 550.05 | 134.67 | 11.892 | 0.44784 | 82.377 |
| 2 | 3.9045 | 1018.4 | 84.858 | 3.5828 | 23.990 | 1086.5 | 550.19 | 135.10 | 12.042 | 0.45144 | 83.776 |
| 3 | 3.7436 | 1018.3 | 85.434 | 3.5808 | 23.911 | 1086.5 | 550.17 | 135.03 | 11.990 | 0.23107 | 82.505 |
| 4 | 3.7516 | 1017.8 | 85.182 | 3.5781 | 23.917 | 1085.9 | 550.00 | 134.67 | 11.910 | 0.26747 | 82.028 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36733 entries, 0 to 36732 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AT 36733 non-null float64 1 AP 36733 non-null float64 2 AH 36733 non-null float64 3 AFDP 36733 non-null float64 4 GTEP 36733 non-null float64 5 TIT 36733 non-null float64 6 TAT 36733 non-null float64 7 TEY 36733 non-null float64 8 CDP 36733 non-null float64 9 CO 36733 non-null float64 10 NOX 36733 non-null float64 dtypes: float64(11) memory usage: 3.1 MB
# identify missing data
missing_data = df.isnull()
# Using a for loop in Python, the method ".value_counts()" counts the number of "True" values.
for column in missing_data.columns.values.tolist():
print(column)
print (missing_data[column].value_counts())
print("")
AT False 36733 Name: AT, dtype: int64 AP False 36733 Name: AP, dtype: int64 AH False 36733 Name: AH, dtype: int64 AFDP False 36733 Name: AFDP, dtype: int64 GTEP False 36733 Name: GTEP, dtype: int64 TIT False 36733 Name: TIT, dtype: int64 TAT False 36733 Name: TAT, dtype: int64 TEY False 36733 Name: TEY, dtype: int64 CDP False 36733 Name: CDP, dtype: int64 CO False 36733 Name: CO, dtype: int64 NOX False 36733 Name: NOX, dtype: int64
# print the summary
print(df.describe())
AT AP AH AFDP GTEP \
count 36733.000000 36733.000000 36733.000000 36733.000000 36733.000000
mean 17.712726 1013.070165 77.867015 3.925518 25.563801
std 7.447451 6.463346 14.461355 0.773936 4.195957
min -6.234800 985.850000 24.085000 2.087400 17.698000
25% 11.781000 1008.800000 68.188000 3.355600 23.129000
50% 17.801000 1012.600000 80.470000 3.937700 25.104000
75% 23.665000 1017.000000 89.376000 4.376900 29.061000
max 37.103000 1036.600000 100.200000 7.610600 40.716000
TIT TAT TEY CDP CO \
count 36733.000000 36733.000000 36733.000000 36733.000000 36733.000000
mean 1081.428084 546.158517 133.506404 12.060525 2.372468
std 17.536373 6.842360 15.618634 1.088795 2.262672
min 1000.800000 511.040000 100.020000 9.851800 0.000388
25% 1071.800000 544.720000 124.450000 11.435000 1.182400
50% 1085.900000 549.880000 133.730000 11.965000 1.713500
75% 1097.000000 550.040000 144.080000 12.855000 2.842900
max 1100.900000 550.610000 179.500000 15.159000 44.103000
NOX
count 36733.000000
mean 65.293067
std 11.678357
min 25.905000
25% 57.162000
50% 63.849000
75% 71.548000
max 119.910000
# Pairgrid plot between variables
import seaborn
import matplotlib.pyplot as plt
graph = seaborn.PairGrid(df[['AT','AP','AH','AFDP','GTEP','TIT','TAT','TEY','CDP','CO','NOX']],corner = True)
graph = graph.map_diag(plt.hist)
# type of graph for non-diagonal
graph = graph.map_offdiag(plt.scatter)
# to add legends
graph = graph.add_legend()
# to show
plt.show()
C:\Users\Lajpat Rai\anaconda3\anaconda\lib\site-packages\seaborn\axisgrid.py:156: UserWarning: The handle <matplotlib.patches.Patch object at 0x000001C9F036B220> has a label of '_nolegend_' which cannot be automatically added to the legend. figlegend = self._figure.legend(handles, labels, **kwargs)
# create histogram for each variabel to check normality.
import matplotlib.pyplot as plt
for i in df.columns:
plt.hist(df[i], color = 'g')
plt.title(i)
plt.show()
Dataset is clean and has no missing values. Few parameters are normal except TIT, TEY, TAT, GTEP, CO & AH. Furthemore, Standard devitaion is small for all parameters hence most of data is close to mean. Data will be normalized before splitting it into training & testing data.
# Outliers detection
red_circle = dict(markerfacecolor = 'red', marker = 's', markeredgecolor= 'white')
fig, axs = plt.subplots(1,len(df.columns), figsize = (30,10))
for i, ax in enumerate(axs.flat):
ax.boxplot(df.iloc[:,i], flierprops = red_circle)
ax.set_title(df.columns[i], fontsize = 20, fontweight = 'bold')
ax.tick_params(axis = 'y', labelsize = 20)
plt.tight_layout()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left")
fig.add_trace(go.Box(x=df['AT']),row=1, col=1)
fig.add_trace(go.Box(x=df['TAT']),row=2, col=1)
fig.update_layout(height=700, showlegend=False)
fig.show()
The boxplot shows that more input variables are outliers, so I'll use mean absolute error (MAE) to evaluate the model in modeling. The MAE is not sensitive to the outliers
# let's visualize the correlation thru heatmap
!pip install pandas --upgrade
import matplotlib.pyplot as plt
import seaborn as sns
corr_matrix = df.corr(method = 'pearson')
plt.figure(figsize=(11,9))
dropSelf = np.zeros_like(corr_matrix)
dropSelf[np.triu_indices_from(dropSelf)] = True
sns.heatmap(corr_matrix, cmap=sns.diverging_palette(220, 10, as_cmap=True), annot=True, fmt=".2f", mask=dropSelf)
sns.set(font_scale=1.5)
Requirement already satisfied: pandas in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (1.4.3) Requirement already satisfied: pytz>=2020.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (2021.3) Requirement already satisfied: numpy>=1.18.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (1.21.5) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
The heatmap above shows the correlation between features and output variables(CO and NOx). It is easy to see that some features are negatively correlated each other. For example, the correlation between TIT and CO is -0.71, and it means that when the Turbine Inlet Temperature (TIT) decreases, the gas-turbine engine will produce more CO because a low TIT reduces the efficiency of the gas-turbine engine(look at the figure below).
X = df.drop(columns = ['CO', 'NOX'])
Y_CO = df[['CO']]
Y_NOX = df[['NOX']]
print(X)
print(Y_CO)
AT AP AH AFDP GTEP TIT TAT TEY CDP
0 4.5878 1018.7 83.675 3.5758 23.979 1086.2 549.83 134.67 11.898
1 4.2932 1018.3 84.235 3.5709 23.951 1086.1 550.05 134.67 11.892
2 3.9045 1018.4 84.858 3.5828 23.990 1086.5 550.19 135.10 12.042
3 3.7436 1018.3 85.434 3.5808 23.911 1086.5 550.17 135.03 11.990
4 3.7516 1017.8 85.182 3.5781 23.917 1085.9 550.00 134.67 11.910
... ... ... ... ... ... ... ... ... ...
36728 3.6268 1028.5 93.200 3.1661 19.087 1037.0 541.59 109.08 10.411
36729 4.1674 1028.6 94.036 3.1923 19.016 1037.6 542.28 108.79 10.344
36730 5.4820 1028.5 95.219 3.3128 18.857 1038.0 543.48 107.81 10.462
36731 5.8837 1028.7 94.200 3.9831 23.563 1076.9 550.11 131.41 11.771
36732 6.0392 1028.8 94.547 3.8752 22.524 1067.9 548.23 125.41 11.462
[36733 rows x 9 columns]
CO
0 0.32663
1 0.44784
2 0.45144
3 0.23107
4 0.26747
... ...
36728 10.99300
36729 11.14400
36730 11.41400
36731 3.31340
36732 11.98100
[36733 rows x 1 columns]
#Find the information gain or mutual information of the independent variable with respect to a target variable NOX
import warnings
warnings.simplefilter("ignore")
from sklearn.feature_selection import mutual_info_regression
mi_NOX = mutual_info_regression(X, Y_NOX)
mi_NOX = pd.Series(mi_NOX)
mi_NOX.index = X.columns
mi_NOX.sort_values(ascending=False)
mi_NOX.sort_values(ascending=False).plot.bar(figsize=(10, 4))
<AxesSubplot:>
#Find the information gain or mutual information of the independent variable with respect to a target variable CO
import warnings
warnings.simplefilter("ignore")
from sklearn.feature_selection import mutual_info_regression
mi_CO = mutual_info_regression(X, Y_CO)
mi_CO = pd.Series(mi_CO)
mi_CO.index = X.columns
mi_CO.sort_values(ascending=False)
mi_CO.sort_values(ascending=False).plot.bar(figsize=(10, 4))
<AxesSubplot:>
from sklearn.feature_selection import SelectKBest
sel_five_cols = SelectKBest(mutual_info_regression, k=5)
sel_five_cols.fit(X, Y_CO)
X.columns[sel_five_cols.get_support()]
Index(['AFDP', 'GTEP', 'TIT', 'TEY', 'CDP'], dtype='object')
from sklearn.feature_selection import SelectKBest
sel_five_cols = SelectKBest(mutual_info_regression, k=5)
sel_five_cols.fit(X, Y_NOX)
X.columns[sel_five_cols.get_support()]
Index(['AT', 'GTEP', 'TIT', 'TEY', 'CDP'], dtype='object')
!pip install mlxtend
Requirement already satisfied: mlxtend in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (0.20.0) Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.0.2) Requirement already satisfied: numpy>=1.16.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.21.5) Requirement already satisfied: joblib>=0.13.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.1.0) Requirement already satisfied: setuptools in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (61.2.0) Requirement already satisfied: matplotlib>=3.0.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (3.5.1) Requirement already satisfied: pandas>=0.24.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.4.3) Requirement already satisfied: scipy>=1.2.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.7.3) Requirement already satisfied: python-dateutil>=2.7 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.4) Requirement already satisfied: fonttools>=4.22.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0) Requirement already satisfied: cycler>=0.10 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (0.11.0) Requirement already satisfied: pillow>=6.2.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (9.0.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (1.3.2) Requirement already satisfied: packaging>=20.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (21.3) Requirement already satisfied: pytz>=2020.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas>=0.24.2->mlxtend) (2021.3) Requirement already satisfied: six>=1.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from scikit-learn>=1.0.2->mlxtend) (2.2.0)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
k_features=5,
forward=True,
floating=False,
scoring = 'r2',
cv = 0)
X_train, X_test, y_train_CO, y_test_CO = train_test_split(X, Y_CO, test_size = 0.30)
#Use SFS to select the top 5 features
sfs.fit(X_train, y_train_CO)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (5,) | [0.49154555980999715] | 0.491546 | (TIT,) |
| 2 | (5, 6) | [0.5426697239666705] | 0.54267 | (TIT, TAT) |
| 3 | (2, 5, 6) | [0.5472493402224305] | 0.547249 | (AH, TIT, TAT) |
| 4 | (2, 5, 6, 7) | [0.5483558819190959] | 0.548356 | (AH, TIT, TAT, TEY) |
| 5 | (2, 5, 6, 7, 8) | [0.5499379260261003] | 0.549938 | (AH, TIT, TAT, TEY, CDP) |
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
sfs = SFS(LinearRegression(),
k_features=9,
forward=True,
floating=False,
scoring='r2',
cv=5)
sfs = sfs.fit(X_train, y_train_CO)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
# SFS for depedent variable NOX
X_train, X_test, y_train_NOX, y_test_NOX = train_test_split(X, Y_NOX, test_size = 0.30)
#Use SFS to select the top 5 features
sfs.fit(X_train, y_train_NOX)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (0,) | [0.3085781398449149, 0.30774129376079484, 0.31... | 0.310286 | (AT,) |
| 2 | (0, 4) | [0.33757096687736354, 0.33999106256866596, 0.3... | 0.341815 | (AT, GTEP) |
| 3 | (0, 2, 4) | [0.3674012766765754, 0.3711325441491694, 0.361... | 0.368251 | (AT, AH, GTEP) |
| 4 | (0, 2, 3, 4) | [0.38397353356680997, 0.39322499554317447, 0.3... | 0.384841 | (AT, AH, AFDP, GTEP) |
| 5 | (0, 2, 3, 4, 6) | [0.389435681282967, 0.3981896870544349, 0.3824... | 0.391489 | (AT, AH, AFDP, GTEP, TAT) |
| 6 | (0, 2, 3, 4, 5, 6) | [0.4211516298498681, 0.4278002258922331, 0.411... | 0.424148 | (AT, AH, AFDP, GTEP, TIT, TAT) |
| 7 | (0, 2, 3, 4, 5, 6, 7) | [0.5038012493696647, 0.4970458112953008, 0.493... | 0.499962 | (AT, AH, AFDP, GTEP, TIT, TAT, TEY) |
| 8 | (0, 1, 2, 3, 4, 5, 6, 7) | [0.5157340389697513, 0.515294191608797, 0.5071... | 0.514645 | (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY) |
| 9 | (0, 1, 2, 3, 4, 5, 6, 7, 8) | [0.5159032451514862, 0.5154487744407628, 0.507... | 0.514794 | (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP) |
sfs = SFS(LinearRegression(),
k_features=9,
forward=True,
floating=False,
scoring='r2',
cv=5)
sfs = sfs.fit(X_train, y_train_NOX)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
from sklearn.feature_selection import RFE
#Build a logistic regression model
#model = LinearRegression()
#Define RFE
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
#Use RFE to select the top 5 features
fit = rfe.fit(X_train, y_train_CO)
features = X.columns.to_list()
df_RFE_results = []
for i in range(X.shape[1]):
df_RFE_results.append(
{
'Feature_names': features[i],
'Selected': rfe.support_[i],
'RFE_ranking': rfe.ranking_[i],
}
)
df_RFE_results = pd.DataFrame(df_RFE_results)
df_RFE_results.index.name='Columns'
df_RFE_results
| Feature_names | Selected | RFE_ranking | |
|---|---|---|---|
| Columns | |||
| 0 | AT | False | 4 |
| 1 | AP | True | 1 |
| 2 | AH | False | 2 |
| 3 | AFDP | True | 1 |
| 4 | GTEP | True | 1 |
| 5 | TIT | False | 5 |
| 6 | TAT | False | 3 |
| 7 | TEY | True | 1 |
| 8 | CDP | True | 1 |
#Use RFE to select the top 5 features for target variable NOX
fit = rfe.fit(X_train, y_train_NOX)
features = X.columns.to_list()
df_RFE_results = []
for i in range(X.shape[1]):
df_RFE_results.append(
{
'Feature_names': features[i],
'Selected': rfe.support_[i],
'RFE_ranking': rfe.ranking_[i],
}
)
df_RFE_results = pd.DataFrame(df_RFE_results)
df_RFE_results.index.name='Columns'
df_RFE_results
| Feature_names | Selected | RFE_ranking | |
|---|---|---|---|
| Columns | |||
| 0 | AT | True | 1 |
| 1 | AP | False | 3 |
| 2 | AH | False | 4 |
| 3 | AFDP | False | 2 |
| 4 | GTEP | False | 5 |
| 5 | TIT | True | 1 |
| 6 | TAT | True | 1 |
| 7 | TEY | True | 1 |
| 8 | CDP | True | 1 |
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
k_features=5,
forward=False,
floating=False,
scoring = 'r2',
cv = 0)
#Use SFS to select the top 5 features
sfs.fit(X_train, y_train_CO)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 9 | (0, 1, 2, 3, 4, 5, 6, 7, 8) | [0.00034212702315228327] | 0.000342 | (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP) |
| 8 | (0, 1, 2, 3, 4, 6, 7, 8) | [0.00034146715969340935] | 0.000341 | (AT, AP, AH, AFDP, GTEP, TAT, TEY, CDP) |
| 7 | (1, 2, 3, 4, 6, 7, 8) | [0.00033952330224518157] | 0.00034 | (AP, AH, AFDP, GTEP, TAT, TEY, CDP) |
| 6 | (1, 2, 3, 4, 7, 8) | [0.00033883378229537797] | 0.000339 | (AP, AH, AFDP, GTEP, TEY, CDP) |
| 5 | (1, 2, 4, 7, 8) | [0.0003276362169429836] | 0.000328 | (AP, AH, GTEP, TEY, CDP) |
sfs = SFS(LinearRegression(),
k_features=5,
forward=False,
floating=False,
scoring = 'r2',
cv = 0)
#Use SFS to select the top 5 features
sfs.fit(X_train, y_train_NOX)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 9 | (0, 1, 2, 3, 4, 5, 6, 7, 8) | [0.5155441018083071] | 0.515544 | (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP) |
| 8 | (0, 1, 2, 3, 5, 6, 7, 8) | [0.5155229974796036] | 0.515523 | (AT, AP, AH, AFDP, TIT, TAT, TEY, CDP) |
| 7 | (0, 1, 2, 3, 5, 6, 7) | [0.5153216712663955] | 0.515322 | (AT, AP, AH, AFDP, TIT, TAT, TEY) |
| 6 | (0, 1, 2, 5, 6, 7) | [0.5146116580880893] | 0.514612 | (AT, AP, AH, TIT, TAT, TEY) |
| 5 | (0, 2, 5, 6, 7) | [0.5001254614255183] | 0.500125 | (AT, AH, TIT, TAT, TEY) |
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
k_features=5,
forward=True,
floating=True,
scoring = 'r2',
cv = 0)
#Use SFS to select the top 5 features
sfs.fit(X, Y_CO)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (5,) | [0.49882444817014526] | 0.498824 | (TIT,) |
| 2 | (5, 6) | [0.5507217644680591] | 0.550722 | (TIT, TAT) |
| 3 | (2, 5, 6) | [0.5552174969842719] | 0.555217 | (AH, TIT, TAT) |
| 4 | (2, 5, 6, 7) | [0.55644618886337] | 0.556446 | (AH, TIT, TAT, TEY) |
| 5 | (2, 5, 6, 7, 8) | [0.5581241617573389] | 0.558124 | (AH, TIT, TAT, TEY, CDP) |
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
k_features=5,
forward=True,
floating=True,
scoring = 'r2',
cv = 0)
#Use SFS to select the top 5 features
sfs.fit(X, Y_NOX)
#Create a dataframe for the SFS results
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (0,) | [0.31155773122511365] | 0.311558 | (AT,) |
| 2 | (0, 4) | [0.34261205608496104] | 0.342612 | (AT, GTEP) |
| 3 | (0, 2, 4) | [0.3691172137915859] | 0.369117 | (AT, AH, GTEP) |
| 4 | (0, 2, 3, 4) | [0.3870041880302296] | 0.387004 | (AT, AH, AFDP, GTEP) |
| 5 | (0, 2, 3, 4, 5) | [0.3947387026237553] | 0.394739 | (AT, AH, AFDP, GTEP, TIT) |
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
# Normalize feature vector
X1 = StandardScaler().fit_transform(X)
# Split the dataset
X_train, X_test, y_train_CO, y_test_CO = train_test_split(X1, Y_CO, test_size = 0.30)
# selecting the features
sel = SelectFromModel(RandomForestRegressor(n_estimators = 100,max_features=5))
sel.fit(X_train, y_train_CO)
SelectFromModel(estimator=RandomForestRegressor(max_features=5))
from matplotlib import pyplot
# define the model
model = RandomForestRegressor()
# fit the model
model.fit(X_train, y_train_CO)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar(X.columns, importance)
plt.title("Feature importance using Ridge Model for CO")
pyplot.show()
Feature: 0, Score: 0.04851 Feature: 1, Score: 0.03015 Feature: 2, Score: 0.04312 Feature: 3, Score: 0.05387 Feature: 4, Score: 0.03842 Feature: 5, Score: 0.49951 Feature: 6, Score: 0.22018 Feature: 7, Score: 0.04007 Feature: 8, Score: 0.02617
# Try different numbers of n_estimators
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
model.set_params(n_estimators=n)
model.fit(X_train, y_train_CO)
scores.append(model.score(X_test, y_test_CO))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)
[<matplotlib.lines.Line2D at 0x1c9b2492b50>]
y_pred_CO = model.predict(X_test)
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_CO, y_pred_CO))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_CO, y_pred_CO))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_CO, y_pred_CO)))
Mean Absolute Error: 0.48083399434592605 Mean Squared Error: 1.0347359233139202 Root Mean Squared Error: 1.017219702578514
from matplotlib import pyplot
# define the model
model = RandomForestRegressor()
# fit the model
model.fit(X_train, y_train_NOX)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar(X.columns, importance)
plt.title("Feature importance using Ridge Model for NOX")
pyplot.show()
Feature: 0, Score: 0.12818 Feature: 1, Score: 0.11821 Feature: 2, Score: 0.14085 Feature: 3, Score: 0.12954 Feature: 4, Score: 0.10377 Feature: 5, Score: 0.07823 Feature: 6, Score: 0.11035 Feature: 7, Score: 0.09496 Feature: 8, Score: 0.09589
y_pred_NOX = model.predict(X_test)
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_NOX, y_pred_NOX))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_NOX, y_pred_NOX))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_NOX, y_pred_NOX)))
Mean Absolute Error: 62.78462970727042 Mean Squared Error: 4081.8323665636367 Root Mean Squared Error: 63.889219486261034
# Try different numbers of n_estimators
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
model.set_params(n_estimators=n)
model.fit(X_train, y_train_NOX)
scores.append(model.score(X_test, y_test_NOX))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)
[<matplotlib.lines.Line2D at 0x1c9a724a040>]
from sklearn.linear_model import LassoCV
lcv = LassoCV()
# Split the dataset
X_train, X_test, y_train_CO, y_test_CO = train_test_split(X1, Y_CO, test_size = 0.30, random_state = 0)
lcv.fit(X_train, y_train_CO)
print("Best alpha using built-in LassoCV: %f" % lcv.alpha_)
print("Best score using built-in LassoCV: %f" %lcv.score(X_train,y_train_CO))
coef = pd.Series(lcv.coef_, index = X.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
Best alpha using built-in LassoCV: 0.001593 Best score using built-in LassoCV: 0.559507 Lasso picked 9 variables and eliminated the other 0 variables
from sklearn.linear_model import Lasso
l1 = Lasso(alpha = 0.001593).fit(X_train, y_train_CO)
#print("Best score using built-in LassoCV: %f" %l1.score(X_train,y_train_CO))
coef = pd.Series(l1.coef_, index = X.columns)
coef
AT -0.255437 AP 0.000267 AH -0.113708 AFDP -0.101240 GTEP 0.381959 TIT -1.343750 TAT -0.577548 TEY -2.249451 CDP 1.470458 dtype: float64
# Lasso Regression with different alpha values
import seaborn as sns
import matplotlib.pyplot as plt
cf = pd.DataFrame()
cf['features'] = X.columns
fig = plt.figure(figsize = (27,30))
i = 0
for a in [0.0001,0.001,0.01,0.5,1]:
l2 = Lasso(alpha = a).fit(X_train, y_train_CO)
l2_predict = l2.predict(X_test)
cf['Lasso Regression'] = np.absolute(l2.coef_.ravel())
sub = fig.add_subplot(2,3,i+1)
plt.subplots_adjust(hspace = 0.3)
sub.set_title('Lasso - ' + str(a), fontsize = 15)
sns.barplot(x = "features", y = "Lasso Regression", data = cf, palette = 'RdYlBu_r')
i = i+1
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model for CO")
Text(0.5, 1.0, 'Feature importance using Lasso Model for CO')
import sklearn.metrics as metrics
from scipy import stats
# Make Prediction
Y_pred_CO = lcv.predict(X_test)
# Return Results
print("The R-square is %0.2f"% lcv.score(X_test, y_test_CO))
print('Lasso Regression MAE: %0.1f'%metrics.mean_absolute_error(y_test_CO,Y_pred_CO))
print('Lasso Regression MSE:%0.1f'%metrics.mean_squared_error(y_test_CO,Y_pred_CO))
print('Lasso Regression RMSE:%0.1f'%np.sqrt(metrics.mean_squared_error(y_test_CO,Y_pred_CO)))
se_lcv = stats.sem(Y_pred_CO)
The R-square is 0.57 Lasso Regression MAE: 0.9 Lasso Regression MSE:2.2 Lasso Regression RMSE:1.5
# Split the dataset
X_train, X_test, y_train_NOX, y_test_NOX = train_test_split(X1, Y_NOX, test_size = 0.30, random_state = 0)
lcv.fit(X_train, y_train_NOX)
print("Best alpha using built-in LassoCV: %f" % lcv.alpha_)
print("Best score using built-in LassoCV: %f" %lcv.score(X_train,y_train_NOX))
coef = pd.Series(lcv.coef_, index = X.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
Best alpha using built-in LassoCV: 0.006519 Best score using built-in LassoCV: 0.518887 Lasso picked 9 variables and eliminated the other 0 variables
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model for NOX")
Text(0.5, 1.0, 'Feature importance using Lasso Model for NOX')
import sklearn.metrics as metrics
from scipy import stats
# Make Prediction
Y_pred_NOX = lcv.predict(X_test)
# Return Results
print("The R-square is %0.2f"% lcv.score(X_test, y_test_NOX))
print('Lasso Regression MAE: %0.1f'%metrics.mean_absolute_error(y_test_NOX,Y_pred_NOX))
print('Lasso Regression MSE:%0.1f'%metrics.mean_squared_error(y_test_NOX,Y_pred_NOX))
print('Lasso Regression RMSE:%0.1f'%np.sqrt(metrics.mean_squared_error(y_test_NOX,Y_pred_NOX)))
se_lcv = stats.sem(Y_pred_NOX)
The R-square is 0.51 Lasso Regression MAE: 5.8 Lasso Regression MSE:66.0 Lasso Regression RMSE:8.1